import pandas as pd  
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder  
from sklearn.cluster import KMeans  
import pickle
  
# Segment credit customers with K-means and profile the target segments.
#
# Steps: load the CSV, encode the categorical features, standardize every
# feature, cluster into four groups, summarize the chosen clusters, then
# print the summary and persist it as a pickle.

# Load the dataset
credit_customers = pd.read_csv("credit_customers.csv")

# Work on a copy of the clustering features so the encoding below leaves the
# original columns intact; the summary further down reads the raw values.
important_columns = ['credit_history', 'age', 'employment', 'credit_amount', 'savings_status']
data_for_clustering = credit_customers[important_columns].copy()

# Label-encode 'savings_status' and 'employment' into integer codes.
# NOTE(review): LabelEncoder numbers categories by the sorted order of their
# values, so the codes impose an arbitrary ranking on these features --
# confirm that this is acceptable for distance-based clustering.
data_for_clustering['savings_status'] = LabelEncoder().fit_transform(data_for_clustering['savings_status'])
data_for_clustering['employment'] = LabelEncoder().fit_transform(data_for_clustering['employment'])

# One-hot encode 'credit_history'; drop_first=True omits the indicator of the
# first category so the remaining columns are not perfectly collinear.
data_for_clustering = pd.get_dummies(data_for_clustering, columns=['credit_history'], drop_first=True)

# Standardize every feature so that no single column dominates the Euclidean
# distances K-means relies on (assumes 'age' and 'credit_amount' are numeric
# -- TODO confirm against the CSV).
data_for_clustering_scaled = StandardScaler().fit_transform(data_for_clustering)

# Perform K-means clustering with 4 clusters.
# NOTE(review): n_init is left at the library default, which differs between
# scikit-learn releases, so cluster ids may vary across versions even with
# random_state fixed -- verify the target segment ids below after upgrading.
kmeans = KMeans(n_clusters=4, random_state=42)
cluster_labels = kmeans.fit_predict(data_for_clustering_scaled)

# Attach the cluster labels to the original (un-encoded) dataset
credit_customers['cluster'] = cluster_labels

# Cluster ids treated as target customer segments. These are hard-coded and
# must match the labels that the K-means run above actually assigns.
target_customer_segments = [1, 2]

# Summarize the key characteristics of each target customer segment:
# numeric columns as means, categorical columns as relative frequencies.
summary = {}
for segment in target_customer_segments:
    segment_customers = credit_customers[credit_customers['cluster'] == segment]
    summary[segment] = {
        'average_age': segment_customers['age'].mean(),
        'average_credit_amount': segment_customers['credit_amount'].mean(),
        'employment_distribution': segment_customers['employment'].value_counts(normalize=True).to_dict(),
        'savings_status_distribution': segment_customers['savings_status'].value_counts(normalize=True).to_dict(),
        'credit_history_distribution': segment_customers['credit_history'].value_counts(normalize=True).to_dict()
    }

# Print the summary and persist it. The context manager guarantees the file
# is flushed and closed even if pickling raises.
print("summary:\n", summary)
with open("./ref_result/summary.pkl", "wb") as summary_file:
    pickle.dump(summary, summary_file)
